imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant graph with one edge per (card, merchant) pair.

    Every distinct ``cc_num`` and ``merchant`` value receives an integer node
    id. Transactions are grouped per (card, merchant) pair and each resulting
    edge carries two attributes:

    * ``label``  -- 1 if any grouped transaction has ``is_fraud`` > 0, else 0
    * ``weight`` -- the summed ``amt`` of the grouped transactions

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Forwarded to networkx as ``create_using``. ``None`` (the
            default) builds a fresh ``nx.Graph`` on every call.

    Returns:
        The populated networkx graph.
    """
    # A graph instance used as a default argument is created once and shared
    # by every call; networkx clears and reuses an instance passed as
    # ``create_using``, so a second call would wipe the first call's result.
    create_using = nx.Graph() if graph_type is None else graph_type

    df = df_input.copy()
    entities = set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
    mapping = {x: node_id for node_id, x in enumerate(entities)}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge target (merchant)

    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    # Collapse the per-pair fraud count into a 0/1 flag.
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=create_using)

    # Column-wise zips replace the repeated iterrows() passes.
    edge_keys = list(zip(df["from"].tolist(), df["to"].tolist()))
    # Edge attribute: whether the pair has any fraudulent transaction.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount of the pair.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"].tolist())), "weight")

    return G


def build_graph_tripartite(df_input, graph_type=None):
    """Build a card/transaction/merchant graph.

    Every row label of the frame (one per transaction), every distinct
    ``cc_num`` and every distinct ``merchant`` receives an integer node id.
    Each transaction node is linked to its card node and to its merchant
    node; both edges carry the transaction's ``is_fraud`` as ``label`` and
    its ``amt`` as ``weight``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Forwarded to networkx as ``create_using``. ``None`` (the
            default) builds a fresh ``nx.Graph`` on every call.

    Returns:
        The populated networkx graph.
    """
    # A graph instance used as a default argument is created once and shared
    # by every call; networkx clears and reuses an instance passed as
    # ``create_using``, so a second call would wipe the first call's result.
    create_using = nx.Graph() if graph_type is None else graph_type

    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # One pass over the columns instead of six iterrows() traversals.
    txn_nodes = [mapping[idx] for idx in df.index]
    card_edges = list(zip(df["in_node"].tolist(), txn_nodes))
    merchant_edges = list(zip(df["out_node"].tolist(), txn_nodes))

    G = nx.from_edgelist(card_edges + merchant_edges, create_using=create_using)

    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()
    for edges in (card_edges, merchant_edges):
        nx.set_edge_attributes(G, dict(zip(edges, labels)), "label")
        nx.set_edge_attributes(G, dict(zip(edges, amounts)), "weight")

    return G
    
    
def down_sample_textbook(df):
    """Return a class-balanced frame by undersampling the non-fraud rows.

    All rows with ``is_fraud == 1`` are kept. From the rows with
    ``is_fraud == 0`` a sample of the same size is drawn without replacement
    (fixed ``random_state=42``). The result lists the fraud rows first,
    followed by the sampled non-fraud rows.
    """
    fraud_rows = df[df.is_fraud == 1].copy()
    legit_rows = df[df.is_fraud == 0].copy()
    legit_sample = sklearn.utils.resample(
        legit_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, legit_sample])

def embedding(Graph):
    """Turn a labelled graph into edge features ``X`` and edge labels ``y``.

    A Node2Vec model is fitted (``window=10``, ``weight_key='weight'``) on a
    copy of the graph, and an ``AverageEmbedder`` built on its vectors is
    queried once per edge with the string ids of the edge's two end points.

    Returns:
        ``(X, y)`` where ``X`` is a list with one embedding per edge, in
        ``Graph.edges`` order, and ``y`` is a NumPy array of the edges'
        ``label`` attributes.
    """
    edge_list = list(Graph.edges)

    # Working copy holding every edge, plus any node that has no edge at all.
    work_graph = Graph.edge_subgraph(edge_list).copy()
    isolated_nodes = set(Graph.nodes) - set(work_graph.nodes)
    work_graph.add_nodes_from(list(isolated_nodes))

    node_vectors = Node2Vec(work_graph, weight_key='weight').fit(window=10).wv
    edge_embedder = AverageEmbedder(node_vectors)

    # Graph -> X (features); the embedder is keyed by string node ids.
    X = []
    for edge in edge_list:
        X.append(edge_embedder[(str(edge[0]), str(edge[1]))])

    # Graph -> y (labels)
    label_by_edge = nx.get_edge_attributes(Graph, "label")
    y = np.array(list(label_by_edge.values()))
    return X, y

def anal(df):
    """Train and score a random forest on bipartite-graph edge embeddings.

    The transactions in ``df`` are turned into a bipartite graph, each edge
    is embedded, the edges are split into a train and a test part
    (``random_state=42``, scikit-learn's default split ratio) and a
    100-tree random forest is fitted on the train part.

    Args:
        df: Transaction frame accepted by ``build_graph_bipartite``.

    Returns:
        A one-row DataFrame with the test-set ``acc``, ``pre``, ``rec`` and
        ``f1`` scores.
    """
    Graph = build_graph_bipartite(df)
    # embedding() returns only (X, y); the train/test split happens here.
    features, labels = embedding(Graph)
    X, XX, y, yy = sklearn.model_selection.train_test_split(
        features, labels, random_state=42
    )
    # The classifier is reached through the sklearn package because the bare
    # name RandomForestClassifier is never imported in this file.
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)
    scores = pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)]}
    )
    return scores

def our_sampling1(df):
    """Keep every transaction of any card that has at least one fraud row.

    Cards are identified by ``cc_num``; a card qualifies when at least one of
    its rows has ``is_fraud == 1``. Row order and columns are unchanged.
    """
    fraud_cards = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    keep = df["cc_num"].isin(fraud_cards)
    return df[keep]

# Load the transactions and drop the first CSV column.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]

# Parse the timestamp column in one vectorized call instead of invoking
# pd.to_datetime once per row.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain

	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	street	city	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	2019-01-01 00:00:00	2.703190e+15	fraud_Rippin, Kub and Mann	misc_net	4.97	Jennifer	Banks	F	561 Perry Cove	Moravian Falls	...	36.0788	-81.1781	3495	Psychologist, counselling	1988-03-09	0b242abb623afc578575680df30655b9	1325376018	36.011293	-82.048315	0
1	2019-01-01 00:00:00	6.304230e+11	fraud_Heller, Gutmann and Zieme	grocery_pos	107.23	Stephanie	Gill	F	43039 Riley Greens Suite 393	Orient	...	48.8878	-118.2105	149	Special educational needs teacher	1978-06-21	1f76529f8574734946361c461b024d99	1325376044	49.159047	-118.186462	0
2	2019-01-01 00:00:00	3.885950e+13	fraud_Lind-Buckridge	entertainment	220.11	Edward	Sanchez	M	594 White Dale Suite 530	Malad City	...	42.1808	-112.2620	4154	Nature conservation officer	1962-01-19	a1a22d70485983eac12b5b88dad1cf95	1325376051	43.150704	-112.154481	0
3	2019-01-01 00:01:00	3.534090e+15	fraud_Kutch, Hermiston and Farrell	gas_transport	45.00	Jeremy	White	M	9443 Cynthia Court Apt. 038	Boulder	...	46.2306	-112.1138	1939	Patent attorney	1967-01-12	6b849c168bdad6f867558c3793159a81	1325376076	47.034331	-112.561071	0
4	2019-01-01 00:03:00	3.755340e+14	fraud_Keeling-Crist	misc_pos	41.96	Tyler	Garcia	M	408 Bradley Rest	Doe Hill	...	38.4207	-79.4629	99	Dance movement psychotherapist	1986-03-28	a41d7549acf90789359a9aa5346dcb46	1325376186	38.674999	-78.632459	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1048570	2020-03-10 16:07:00	6.011980e+15	fraud_Fadel Inc	health_fitness	77.00	Haley	Wagner	F	05561 Farrell Crescent	Annapolis	...	39.0305	-76.5515	92106	Accountant, chartered certified	1943-05-28	45ecd198c65e81e597db22e8d2ef7361	1362931649	38.779464	-76.317042	0
1048571	2020-03-10 16:07:00	4.839040e+15	fraud_Cremin, Hamill and Reichel	misc_pos	116.94	Meredith	Campbell	F	043 Hanson Turnpike	Hedrick	...	41.1826	-92.3097	1583	Geochemist	1999-06-28	c00ce51c6ebb7657474a77b9e0b51f34	1362931670	41.400318	-92.726724	0
1048572	2020-03-10 16:08:00	5.718440e+11	fraud_O'Connell, Botsford and Hand	home	21.27	Susan	Mills	F	005 Cody Estates	Louisville	...	38.2507	-85.7476	736284	Engineering geologist	1952-04-02	17c9dc8b2a6449ca2473726346e58e6c	1362931711	37.293339	-84.798122	0
1048573	2020-03-10 16:08:00	4.646850e+18	fraud_Thompson-Gleason	health_fitness	9.52	Julia	Bell	F	576 House Crossroad	West Sayville	...	40.7320	-73.1000	4056	Film/video editor	1990-06-25	5ca650881b48a6a38754f841c23b77ab	1362931718	39.773077	-72.213209	0
1048574	2020-03-10 16:08:00	2.283740e+15	fraud_Buckridge PLC	misc_pos	6.81	Shannon	Williams	F	9345 Spencer Junctions Suite 183	Alpharetta	...	34.0770	-84.3033	165556	Prison officer	1997-12-27	8d0a575fe635bbde12f1a2bffc126731	1362931730	33.601468	-83.891921	0

1048575 rows × 22 columns

시도

# Keep a reproducible 20% sample of the non-fraud rows and all fraud rows.
_df1 = fraudTrain.query("is_fraud == 0").sample(frac=0.20, random_state=42)
_df2 = fraudTrain.query("is_fraud == 1")
df02 = pd.concat([_df1, _df2])
df02.shape

(214520, 22)

# Scratch calculation: number of ordered (i, j) pairs among 214520 rows.
214520*214520

46018830400

df02.is_fraud.mean().round(5)

0.028

사기 거래 비율: 약 2.8%

# Renumber the rows from 0; the previous row labels are kept as a column.
df02 = df02.reset_index(drop=False)

# Number of sampled transactions.
N = df02.shape[0]

tr/test

# Reproducible train/test split with scikit-learn's default split ratio.
df02_tr, df02_test = sklearn.model_selection.train_test_split(
    df02,
    random_state=42,
)

# Fraud rate of each part, in (train, test) order.
tuple(part.is_fraud.mean().round(5) for part in (df02_tr, df02_test))

(0.02757, 0.02927)

df02_tr.shape, df02_test.shape

((160890, 23), (53630, 23))

# Boolean masks over the stacked [train rows, test rows] ordering. The
# lengths come from the actual split instead of hard-coded row counts, so the
# masks stay correct when the sample size changes.
_n_train, _n_test = len(df02_tr), len(df02_test)
train_mask = np.concatenate((np.full(_n_train, True), np.full(_n_test, False)))
test_mask = np.concatenate((np.full(_n_train, False), np.full(_n_test, True)))
print("Train Mask:", train_mask)
print("Test Mask:", test_mask)

Train Mask: [ True  True  True ... False False False]
Test Mask: [False False False ...  True  True  True]

train_mask.sum(), test_mask.sum()

(160890, 53630)

# Stack the train rows first and the test rows second.
df02_com = pd.concat([df02_tr, df02_test])

# Renumber the rows from 0; the previous row labels are kept as a column.
df02_com = df02_com.reset_index()

# NOTE(review): np.save is handed a DataFrame here; presumably it is stored as
# a plain (object) array without the column names -- confirm that whatever
# loads 'df02_com.npy' expects that, or persist the frame with pandas instead.
np.save('df02_com.npy', df02_com)

df02_com

	level_0	index	trans_date_trans_time	cc_num	merchant	category	amt	first	last	gender	...	lat	long	city_pop	job	dob	trans_num	unix_time	merch_lat	merch_long	is_fraud
0	176322	944206	2020-01-12 14:26:00	1.800680e+14	fraud_Durgan, Gislason and Spencer	home	83.42	Mary	Juarez	F	...	42.9385	-88.3950	2328	Applications developer	1942-01-06	dac0ad2e6b9956237cdca85beea4b422	1358000819	43.301471	-88.731241	0
1	57361	305252	2019-05-27 23:22:00	4.158950e+15	fraud_Douglas-White	entertainment	119.90	Justin	Bell	M	...	40.4308	-79.9205	687276	Scientist, marine	1973-10-19	6660431462def289ceb3e176e88f58e5	1338160935	40.673836	-80.710911	0
2	76922	326443	2019-06-04 19:27:00	3.040770e+13	fraud_Bernier and Sons	kids_pets	47.11	Danielle	Evans	F	...	42.1939	-76.7361	520	Psychotherapist	1991-10-13	3e0fdbbb80e5e068e5873be2a539cc24	1338838050	42.298622	-77.473862	0
3	73661	515686	2019-08-11 09:04:00	4.319580e+18	fraud_Kutch LLC	gas_transport	56.51	Kathleen	Nash	F	...	37.1788	-82.6950	502	Chief Financial Officer	1960-02-01	66c331ada80949f23b6eb54a2a805b30	1344675872	37.867947	-83.096063	0
4	149325	217309	2019-04-20 21:16:00	6.041621e+10	fraud_Beer-Jast	kids_pets	1.42	Mary	Diaz	F	...	43.0048	-108.8964	1645	Information systems manager	1986-02-17	73d345383dacf28ddb303df878af6034	1334956594	43.454507	-109.492721	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
214515	209807	138275	2019-03-16 22:16:00	6.761020e+11	fraud_Crooks and Sons	personal_care	23.07	Natasha	Mclaughlin	F	...	38.4549	-122.2564	94014	Airline pilot	1985-08-21	f5ed056d58e7cafe35201991383d5af7	1331936181	38.440172	-121.930178	1
214516	116046	108839	2019-03-03 16:55:00	3.453890e+14	fraud_Dooley Inc	shopping_pos	2.57	Justin	Fowler	M	...	33.9215	-89.6782	3451	Financial trader	1984-05-19	617d850e784d23f70bdbf99aec16877d	1330793701	34.248988	-88.691253	0
214517	84374	244548	2019-05-02 21:02:00	4.170690e+15	fraud_Hettinger, McCullough and Fay	home	15.40	Samuel	Frey	M	...	35.6665	-97.4798	116001	Media buyer	1993-05-10	6e7be41baa4854c8c122124573ab826a	1335992520	34.789935	-96.704044	0
214518	766	968783	2020-01-26 17:33:00	4.477160e+18	fraud_Jast Ltd	shopping_net	66.19	Angela	Ross	F	...	40.3928	-111.7941	42384	Futures trader	1992-12-29	823cb59773e114b5de50c51ce520f181	1359221619	40.382572	-111.342788	0
214519	18374	929851	2020-01-04 13:22:00	4.742880e+18	fraud_Kihn-Schuster	food_dining	30.57	Cassandra	Sanders	F	...	20.0271	-155.3697	1490	Scientist, research (maths)	1991-04-13	a0921bbdc96d65bfcfdb78b369233303	1357305750	21.010877	-155.079405	0

214520 rows × 24 columns

데이터 돌아가는중..

import time

t1 = time.time()
_cc_num = np.array(df02_com['cc_num'])
_trans_date_trans_time = np.array(df02_com['trans_date_trans_time'].apply(lambda x: x.value))

# Build every ordered pair (i, j) of the first N rows with NumPy broadcasting
# instead of a Python double loop with N*N list appends. The weight is the
# absolute timestamp gap when both rows share a card number, and 0 otherwise.
# NOTE: the result still has N*N rows, so memory grows quadratically with N.
_cc = _cc_num[:N]
_ts = _trans_date_trans_time[:N]
_same_card = _cc[:, None] == _cc[None, :]
_time_gap = np.abs(_ts[:, None] - _ts[None, :])
_pair_weight = np.where(_same_card, _time_gap, 0).ravel()
del _same_card, _time_gap  # release the N x N temporaries early

# Rows are ordered exactly like the nested loops: i outer, j inner.
_row_ids = np.arange(N)
edge_index_list_plus_nparr = np.column_stack(
    (np.repeat(_row_ids, N), np.tile(_row_ids, N), _pair_weight)
)
# Kept under the original name for the following cells; it is now an ndarray
# with the same rows rather than a list of [i, j, time_difference] lists.
edge_index_list_plus = edge_index_list_plus_nparr
np.save('edge_index_list_plus_02.npy', edge_index_list_plus_nparr)
t2 = time.time()
t2-t1

7.3492796421051025

데이터 돌아가는중…………………. 다시다시

# Force a floating-point array: the pairs hold integer indices and integer
# timestamp gaps, so NumPy would otherwise infer an integer dtype and the
# fractional weights written into column 2 later would be truncated to 0.
edge_index = np.array(edge_index_list_plus, dtype=np.float64)

edge_index.shape

(144288144, 3)

edge_index

array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00],
       [0.0000e+00, 2.0000e+00, 0.0000e+00],
       ...,
       [1.2011e+04, 1.2009e+04, 0.0000e+00],
       [1.2011e+04, 1.2010e+04, 0.0000e+00],
       [1.2011e+04, 1.2011e+04, 0.0000e+00]])

# Column 2 holds the time gap of each pair; theta is its mean over all rows,
# including the pairs whose gap was set to 0.
edge_index[:, 2] = np.absolute(edge_index[:, 2])
theta = np.mean(edge_index[:, 2])
theta

10973.519989002007

# Replace each gap by exp(-gap / theta). A weight of exactly 1 (gap 0, which
# includes the pairs that were assigned 0 earlier) is reset to 0.
_decay = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = np.where(_decay != 1, _decay, 0)
edge_index

array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00],
       [0.0000e+00, 2.0000e+00, 0.0000e+00],
       ...,
       [1.2011e+04, 1.2009e+04, 0.0000e+00],
       [1.2011e+04, 1.2010e+04, 0.0000e+00],
       [1.2011e+04, 1.2011e+04, 0.0000e+00]])

edge_index_list_updated = edge_index.tolist()
# Mean weight, read straight from the array the list was just made from
# (avoids rebuilding a second full array from the nested list).
edge_index[:, 2].mean()

8.443606280313275e-05

# Mean edge weight, used as the selection threshold. edge_index holds the same
# values as edge_index_list_updated, so no list-to-array rebuild is needed.
mm = edge_index[:, 2].mean()

가중치가 평균(mm)보다 큰 엣지, 즉 같은 카드에서 시간 간격이 상대적으로 짧은 거래 쌍만 선택한다.

# Keep only the pairs whose weight exceeds the mean weight. The filter runs
# on an array instead of a Python loop over every row.
_rows = np.asarray(edge_index_list_updated, dtype=float).reshape(-1, 3)
_kept_pairs = _rows[_rows[:, 2] > mm, :2].astype(np.int64)
selected_edges = [tuple(pair) for pair in _kept_pairs.tolist()]

# Shape [2, num_edges]; building from the (num_edges, 2) array keeps the shape
# [2, 0] even when no pair is selected.
edge_index_selected = torch.tensor(_kept_pairs, dtype=torch.long).t()

edge_index_selected.shape

torch.Size([2, 51392])

data설정(x, edge_index, y)

# Node feature: the transaction amount. df02_com is the frame built above;
# the name df50_com is not defined anywhere in this file.
x = df02_com['amt']

# Hand torch a plain NumPy array rather than a pandas Series.
a = torch.tensor(x.to_numpy(), dtype=torch.float)

# One feature per node: shape (num_nodes, 1).
a = a.reshape(-1,1)
a

tensor([[921.2400],
        [698.2800],
        [220.5600],
        ...,
        [ 17.9700],
        [  7.5800],
        [824.9900]])

# Node label: the fraud flag. df02_com is the frame built above; the name
# df50_com is not defined anywhere in this file.
y = df02_com['is_fraud']

# Hand torch a plain NumPy array rather than a pandas Series.
b = torch.tensor(y.to_numpy(), dtype=torch.int64)

tensor([1, 1, 0,  ..., 1, 0, 1])

import torch_geometric

# Store the masks as torch.bool tensors so every attribute of the Data object
# is a tensor, like x, y and edge_index.
data = torch_geometric.data.Data(
    x=a,
    edge_index=edge_index_selected,
    y=b,
    train_mask=torch.as_tensor(train_mask, dtype=torch.bool),
    test_mask=torch.as_tensor(test_mask, dtype=torch.bool),
)

data

Data(x=[12012, 1], edge_index=[2, 51392], y=[12012], train_mask=[12012], test_mask=[12012])

gnn



import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node (default 1).
        hidden_channels: Output width of the first convolution (default 16).
        num_classes: Number of output classes (default 2).

    The defaults reproduce the original fixed architecture, so ``GCN()``
    builds the same model as before.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities of shape (num_nodes, num_classes).

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Instantiate the network with its default layer sizes.
model = GCN()

model

GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

# Adam with learning rate 0.01 and weight decay 5e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Switch to training mode so the dropout in forward() is active.
model.train()

GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)

# 200 full-batch training steps.
for epoch in range(200):
    optimizer.zero_grad()  # clear the gradients of the previous step
    out = model(data)  # log-probabilities for every node
    # The loss only looks at the nodes selected by the training mask.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Evaluation mode disables dropout; no_grad() skips autograd bookkeeping,
# which is not needed for inference.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
# NOTE(review): fraud is a small minority of the sampled rows, so accuracy
# alone says little here -- consider also reporting precision/recall/F1.
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9321